2.1. Incomplete dataset
load("/Volumes/LGBT Project data/Multiple Imputation/d_2010_incomplete.RData")
summary( d_2010_incomplete )
sampling_strata_region calibrated_weight no.of.population sexual_identity_2010 sexual_identity_fluidity
Lidingö : 1368 Min. : 5.424 Min. : 6831 Heterosexual:28471 0 :11534
Nacka : 1342 1st Qu.: 25.180 1st Qu.: 26849 Homosexual : 361 1 : 1656
Nynäshamn : 1284 Median : 43.971 Median : 34342 Bisexual : 381 NA's:17577
Upplands-Väsby: 1268 Mean : 52.157 Mean : 41143 Uncertain : 394
Sundbyberg : 1144 3rd Qu.: 70.027 3rd Qu.: 56341 NA's : 1160
Botkyrka : 1082 Max. :282.667 Max. :101916
(Other) :23279
age sex country_of_birth education income marital_status
Min. : 18.00 Male :13829 Sweden :25157 <=9 years : 4998 Min. : 0 Never married : 9887
1st Qu.: 37.00 Female:16938 Europe : 3343 10-12 years:12548 1st Qu.: 1917 Currently married:15066
Median : 51.00 Outside Europe: 2267 >=13 years :13001 Median : 2696 Other : 5814
Mean : 51.04 NA's : 220 Mean : 3247
3rd Qu.: 65.00 3rd Qu.: 3667
Max. :104.00 Max. :450719
living_alone personal_support weight_strata
yes : 6432 yes :26882 2 : 1238
no :24005 no : 3474 16 : 1235
NA's: 330 NA's: 411 8 : 1232
11 : 1232
14 : 1232
18 : 1232
(Other):23366
sapply( d_2010_incomplete, class ) # all continuous variables are numeric, and all categorical variables are factor
sampling_strata_region calibrated_weight no.of.population sexual_identity_2010
"factor" "numeric" "numeric" "factor"
sexual_identity_fluidity age sex country_of_birth
"factor" "numeric" "factor" "factor"
education income marital_status living_alone
"factor" "numeric" "factor" "factor"
personal_support weight_strata
"factor" "factor"
miss_var_summary( d_2010_incomplete ) # 57.1% missing in sexual_identity_fluidity, 3.8% in sexual_identity_2010, 1.3% in personal_support, 1.1% in living_alone, and 0.7% in education
2.2. Two-level multivariate normal imputation
# specify imputation model
# fml_imp_2010 <- sexual_identity_fluidity + sexual_identity_2010 + personal_support + living_alone + education ~ 1 + age*sex + country_of_birth + income + marital_status + ( 1 | weight_strata )
# final imputation with the chosen number of iterations
# imp_final_2010 <- jomoImpute( data = d_2010_incomplete,
# formula = fml_imp_2010,
# random.L1 = "full",
# n.burn = 2000,
# n.iter = 1000,
# m = 80,
# seed = 12345
# ) # took around 20 hours
load("/Volumes/LGBT Project data/Multiple Imputation/imp_final_2010.RData")
summary( imp_final_2010 ) # summarize model and display convergence statistics
Call:
jomoImpute(data = d_2010_incomplete, formula = fml_imp_2010,
random.L1 = "full", n.burn = 2000, n.iter = 1000, m = 80,
seed = 12345)
Cluster variable: weight_strata
Target variables: sexual_identity_fluidity sexual_identity_2010 personal_support living_alone education
Fixed effect predictors: (Intercept) age sex country_of_birth income marital_status age:sex
Random effect predictors: (Intercept)
Performed 2000 burn-in iterations, and generated 80 imputed data sets,
each 1000 iterations apart.
Potential scale reduction (Rhat, imputation phase):
Min 25% Mean Median 75% Max
Beta: 1.000 1.000 1.004 1.001 1.004 1.028
Psi: 1.000 1.000 1.000 1.000 1.000 1.001
Sigma: 1.000 1.000 1.073 1.012 1.056 1.989
Largest potential scale reduction:
Beta: [1,2], Psi: [2,1], Sigma: [42,1]
Missing data per variable:
weight_strata sexual_identity_fluidity sexual_identity_2010 personal_support living_alone education
MD% 0 57.1 3.8 1.3 1.1 0.7
sampling_strata_region calibrated_weight no.of.population age sex country_of_birth income marital_status
MD% 0 0 0 0 0 0 0 0
plot( imp_final_2010, trace = "all", print = "beta" ) # check trace and autocorrelation plots

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

NA

2.3. Validate imputed data
# extract imputed datasets
original_data_2010 <- mitmlComplete( imp_final_2010, print = 0 ) # extract original incomplete dataset
implist_2010 <- mitmlComplete( imp_final_2010, print = "all" ) # extract all imputed datasets
original_data_2010$imputation <- "0"
all_data_2010 <- bind_rows( original_data_2010,
bind_rows( implist_2010, .id = "imputation" ) ) # merge datasets
all_data_2010$imputation <- as.numeric( all_data_2010$imputation )
summary( all_data_2010 )
weight_strata sexual_identity_fluidity sexual_identity_2010 personal_support living_alone education
2 : 100278 0 :2112580 Heterosexual:2391929 yes :2205638 yes : 529530 <=9 years : 409994
16 : 100035 1 : 361970 Homosexual : 30755 no : 286078 no :1962267 10-12 years:1023200
8 : 99792 NA's: 17577 Bisexual : 32530 NA's: 411 NA's: 330 >=13 years :1058713
11 : 99792 Uncertain : 35753 NA's : 220
14 : 99792 NA's : 1160
18 : 99792
(Other):1892646
sampling_strata_region calibrated_weight no.of.population age sex country_of_birth
Lidingö : 110808 Min. : 5.424 Min. : 6831 Min. : 18.00 Male :1120149 Sweden :2037717
Nacka : 108702 1st Qu.: 25.180 1st Qu.: 26849 1st Qu.: 37.00 Female:1371978 Europe : 270783
Nynäshamn : 104004 Median : 43.971 Median : 34342 Median : 51.00 Outside Europe: 183627
Upplands-Väsby: 102708 Mean : 52.157 Mean : 41143 Mean : 51.04
Sundbyberg : 92664 3rd Qu.: 70.027 3rd Qu.: 56341 3rd Qu.: 65.00
Botkyrka : 87642 Max. :282.667 Max. :101916 Max. :104.00
(Other) :1885599
income marital_status imputation
Min. : 0 Never married : 800847 Min. : 0
1st Qu.: 1917 Currently married:1220346 1st Qu.:20
Median : 2696 Other : 470934 Median :40
Mean : 3247 Mean :40
3rd Qu.: 3667 3rd Qu.:60
Max. :450719 Max. :80
# sexual identity in 2010
ggplot( all_data_2010[ !is.na( all_data_2010$sexual_identity_2010 ), ],
aes( fill = sexual_identity_2010, x = imputation ) ) +
geom_bar( position = "fill" ) +
scale_y_continuous( labels = scales::percent ) +
scale_fill_discrete( name = "Sexual identity in 2010" ) +
labs(
x = "Imputation number",
y = "Proportion",
caption = "Notes: Imputation number 0 represents the original incomplete dataset." ) +
theme_classic() +
theme( axis.title.x = element_text( family = "Arial", size = 11 ),
axis.text.x = element_text( family = "Arial", size = 11 ),
axis.text.y = element_text( family = "Arial", size = 11 ),
axis.title.y = element_text( family = "Arial", size = 11 ),
legend.text = element_text( family = "Arial", size = 10 ),
legend.title = element_text( family = "Arial", size = 10 ),
legend.position = "bottom",
plot.caption = element_text( family = "Arial", size = 10, hjust = 0 )
)

# change in sexual identity during 2010-2021
ggplot( all_data_2010[ !is.na( all_data_2010$sexual_identity_fluidity ), ],
aes( fill = sexual_identity_fluidity, x = imputation ) ) +
geom_bar( position = "fill" ) +
scale_y_continuous( labels = scales::percent ) +
scale_fill_discrete( name = "Change in sexual identity during 2010-2021", labels = c( "No", "Yes" ) ) +
labs(
x = "Imputation number",
y = "Proportion",
caption = "Notes: Imputation number 0 represents the original incomplete dataset." ) +
theme_classic() +
theme( axis.title.x = element_text( family = "Arial", size = 11 ),
axis.text.x = element_text( family = "Arial", size = 11 ),
axis.text.y = element_text( family = "Arial", size = 11 ),
axis.title.y = element_text( family = "Arial", size = 11 ),
legend.text = element_text( family = "Arial", size = 10 ),
legend.title = element_text( family = "Arial", size = 10 ),
legend.position = "bottom",
plot.caption = element_text( family = "Arial", size = 10, hjust = 0 )
)

# education
ggplot( all_data_2010[ !is.na( all_data_2010$education ), ],
aes( fill = education, x = imputation ) ) +
geom_bar( position = "fill" ) +
scale_y_continuous( labels = scales::percent ) +
scale_fill_discrete( name = "Level of education" ) +
labs(
x = "Imputation number",
y = "Proportion",
caption = "Notes: Imputation number 0 represents the original incomplete dataset." ) +
theme_classic() +
theme( axis.title.x = element_text( family = "Arial", size = 11 ),
axis.text.x = element_text( family = "Arial", size = 11 ),
axis.text.y = element_text( family = "Arial", size = 11 ),
axis.title.y = element_text( family = "Arial", size = 11 ),
legend.text = element_text( family = "Arial", size = 10 ),
legend.title = element_text( family = "Arial", size = 10 ),
legend.position = "bottom",
plot.caption = element_text( family = "Arial", size = 10, hjust = 0 )
)

# living status
ggplot( all_data_2010[ !is.na( all_data_2010$living_alone ), ],
aes( fill = living_alone, x = imputation ) ) +
geom_bar( position = "fill" ) +
scale_y_continuous( labels = scales::percent ) +
scale_fill_discrete( name = "Living alone", labels = c( "Yes", "No" ) ) +
labs(
x = "Imputation number",
y = "Proportion",
caption = "Notes: Imputation number 0 represents the original incomplete dataset." ) +
theme_classic() +
theme( axis.title.x = element_text( family = "Arial", size = 11 ),
axis.text.x = element_text( family = "Arial", size = 11 ),
axis.text.y = element_text( family = "Arial", size = 11 ),
axis.title.y = element_text( family = "Arial", size = 11 ),
legend.text = element_text( family = "Arial", size = 10 ),
legend.title = element_text( family = "Arial", size = 10 ),
legend.position = "bottom",
plot.caption = element_text( family = "Arial", size = 10, hjust = 0 )
)

# personal support
ggplot( all_data_2010[ !is.na( all_data_2010$personal_support ), ],
aes( fill = personal_support, x = imputation ) ) +
geom_bar( position = "fill" ) +
scale_y_continuous( labels = scales::percent ) +
scale_fill_discrete( name = "Personal support", labels = c( "Yes", "No" ) ) +
labs(
x = "Imputation number",
y = "Proportion",
caption = "Notes: Imputation number 0 represents the original incomplete dataset." ) +
theme_classic() +
theme( axis.title.x = element_text( family = "Arial", size = 11 ),
axis.text.x = element_text( family = "Arial", size = 11 ),
axis.text.y = element_text( family = "Arial", size = 11 ),
axis.title.y = element_text( family = "Arial", size = 11 ),
legend.text = element_text( family = "Arial", size = 10 ),
legend.title = element_text( family = "Arial", size = 10 ),
legend.position = "bottom",
plot.caption = element_text( family = "Arial", size = 10, hjust = 0 )
)
